Biostat 203B Homework 3

Due Mar 30, 2024 @ 11:59PM

Author

Lingyi Zhang and 606332255

Display machine information:

sessionInfo()
R version 4.3.1 (2023-06-16)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Big Sur 11.7.6

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Los_Angeles
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] compiler_4.3.1    fastmap_1.1.1     cli_3.6.1         tools_4.3.1      
 [5] htmltools_0.5.7   rstudioapi_0.15.0 yaml_2.3.8        rmarkdown_2.25   
 [9] knitr_1.45        jsonlite_1.8.7    xfun_0.41         digest_0.6.34    
[13] rlang_1.1.1       evaluate_0.23    

Load necessary libraries (you can add more as needed).

library(arrow)
library(gtsummary)
library(memuse)
library(pryr)
library(R.utils)
library(tidyverse)
library(lubridate)

Display my machine memory.

memuse::Sys.meminfo()
Totalram:    8.000 GiB 
Freeram:   801.703 MiB 

In this exercise, we use tidyverse (ggplot2, dplyr, etc) to explore the MIMIC-IV data introduced in homework 1 and to build a cohort of ICU stays.

Q1. Visualizing patient trajectory

Visualizing a patient’s encounters in a health care system is a common task in clinical data analysis. In this question, we will visualize a patient’s ADT (admission-discharge-transfer) history and ICU vitals in the MIMIC-IV data. ### Q1.1 ADT history A patient’s ADT history records the time of admission, discharge, and transfer in the hospital. This figure shows the ADT history of the patient with subject_id 10001217 in the MIMIC-IV data. The x-axis is the calendar time, and the y-axis is the type of event (ADT, lab, procedure). The color of the line segment represents the care unit. The size of the line segment represents whether the care unit is an ICU/CCU. The crosses represent lab events, and the shape of the dots represents the type of procedure. The title of the figure shows the patient’s demographic information and the subtitle shows top 3 diagnoses.

Do a similar visualization for the patient with subject_id 10013310 using ggplot.

Hint: We need to pull information from data files patients.csv.gz, admissions.csv.gz, transfers.csv.gz, labevents.csv.gz, procedures_icd.csv.gz, diagnoses_icd.csv.gz, d_icd_procedures.csv.gz, and d_icd_diagnoses.csv.gz. For the big file labevents.csv.gz, use the Parquet format you generated in Homework 2. For reproducibility, make the Parquet folder labevents_pq available at the current working directory hw3, for example, by a symbolic link. Make your code reproducible.

# Data directory
setwd("~/mimic/hosp")

# Load
patients <- read_csv("patients.csv.gz")
admissions <- read_csv("admissions.csv.gz")
transfers <- read_csv("transfers.csv.gz")
labevents <- open_dataset("labevents_pq", format = "parquet")
procedures_icd <- read_csv("procedures_icd.csv.gz")
diagnoses_icd <- read_csv("diagnoses_icd.csv.gz")
d_icd_procedures <- read_csv("d_icd_procedures.csv.gz")
d_icd_diagnoses <- read_csv("d_icd_diagnoses.csv.gz")

# Filter
patient_id <- 10013310

patient_data <- patients %>% filter(subject_id == patient_id)
admission_data <- admissions %>% filter(subject_id == patient_id)
transfer_data <- transfers %>% filter(subject_id == patient_id)
lab_data <- labevents %>%
  filter(subject_id == patient_id)%>%
  collect()
procedure_data <- procedures_icd %>% filter(subject_id == patient_id)
diagnosis_data <- diagnoses_icd %>% filter(subject_id == patient_id)

# Transform time and factor seq_num
procedure_data$chartdate <- as.POSIXct(procedure_data$chartdate, format = "%Y/%m/%d", tz = "UTC")
procedure_data$seq_num <- factor(procedure_data$seq_num)

# ADT 
# Filter out discharge events as they do not have an outtime
adt_data <- transfer_data %>% filter(eventtype != 'discharge')

shapes <- c(15, 16, 17, 18, 19, 20, 21)
set.seed(123)
unique_types <- unique(procedure_data$seq_num)
shapes_sampled <- sample(shapes, length(unique_types))
shape_mapping <- setNames(shapes_sampled, unique_types)

# title
formatted_title <- sprintf("Patient: %s, %s, %s years old, %s ", patient_data$subject_id[1], patient_data$gender[1], patient_data$anchor_age[1],admission_data$race[1])

# subtitle
joined_df <- left_join(diagnosis_data, d_icd_diagnoses, by = c("icd_code"))
top_long_titles <- joined_df %>%
  select(long_title) %>%
  slice(1:3)
formatted_subtitle <- paste(top_long_titles$long_title, collapse = "\n")

p <- ggplot() +
  labs(title = formatted_title)+
  labs(subtitle = formatted_subtitle)+
  geom_segment(data = adt_data, aes(x = intime, xend = outtime, y = 3, yend = 3, colour = careunit), size = ifelse(grepl("CU", adt_data$careunit), 4, 1)) +
  geom_point(data = lab_data , aes(x = charttime, y = 2), shape = 3, size = 3) +
  geom_point(data = procedure_data, aes(x = chartdate, y = 1, shape = seq_num),colour = "black", size = 3) +
  scale_shape_manual(values = shape_mapping) +
  scale_x_datetime(name = "Calender Time") +
  scale_y_continuous(name = "", limits = c(0, 4), breaks = c(1, 2, 3), labels = c("Procedure", "Lab", "ADT")) + 
  scale_color_discrete(name = "Care Unit")+
  scale_shape_discrete(name = "Procedure")+
  theme(
  panel.border = element_rect(color = "black", size = 1, fill = NA), 
  legend.box = "vertical"
  )+
   theme(axis.text.y = element_text(), axis.ticks.y = element_line())
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.
print(p)

Q1.2 ICU stays

ICU stays are a subset of ADT history. This figure shows the vitals of the patient 10001217 during ICU stays. The x-axis is the calendar time, and the y-axis is the value of the vital. The color of the line represents the type of vital. The facet grid shows the abbreviation of the vital and the stay ID.

Do a similar visualization for the patient 10013310.

The file is too big, so pre-process the data and load.

setwd("~/mimic/icu") 
patient_id <- 10013310
chartevents_dataset <- open_dataset("chartevents.csv", format = "csv")
chart_events <- chartevents_dataset %>% filter(subject_id == patient_id)
chartevents_filtered <- chart_events %>%
  filter(itemid %in% c(220045, 220179, 220180, 223761, 220210)) %>%
  collect()

write.csv(chartevents_filtered,"chartevents_filtered.csv")
setwd("~/mimic/icu") 
# read data- this is a filterd version, subject_id
chart_events <- read_csv("chartevents_filtered.csv")
patient_id <- 10013310
chart_events <- chart_events %>% filter(subject_id == patient_id)
chart_events <- chart_events %>% 
  filter(itemid %in% c(220045, 220179, 220180, 223761, 220210))

formatted_title <- sprintf("Patient: %s ICU stays - Vitals", patient_id)

chart_events$item_label <- factor(chart_events$itemid, 
                        levels = c("220045","220179","220180","220210","223761"), 
                        labels = c("HR", "NBPd", "NBPs", "RR", "Temperature F"))

p <- ggplot(chart_events, aes(x=charttime, y=value, color=factor(itemid))) + 
  geom_line() + 
  geom_point(size=1.2) +
  facet_grid(item_label ~ stay_id, scales = "free_x") + 
  theme_minimal() + 
  labs(x="", y="", title=formatted_title) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1,size=12, face="bold"),
    panel.background = element_rect( colour = "black"),
    strip.text.x = element_text(color="white", face="bold", size=12),
    strip.text.y = element_text(color="white", face="bold", size=12),
    strip.background = element_rect(fill="grey60", colour="NA", size=1)
  )+
  scale_color_manual(
    values=c("#E77F71", "#A2A52F", "#55BC82", "#54ADF1", "#D972ED"),
    guide = FALSE
  )

# 打印图表
print(p)
Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
ggplot2 3.3.4.
ℹ Please use "none" instead.

Q2. ICU stays

icustays.csv.gz (https://mimic.mit.edu/docs/iv/modules/icu/icustays/) contains data about Intensive Care Units (ICU) stays. The first 10 lines are

zcat < ~/mimic/icu/icustays.csv.gz | head
subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.4102662037037037
10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.4975347222222222
10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.1180324074074075
10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.9481134259259258
10001725,25563031,31205490,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2110-04-11 15:52:22,2110-04-12 23:59:56,1.338587962962963
10001884,26184834,37510196,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2131-01-11 04:20:05,2131-01-20 08:27:30,9.171817129629629
10002013,23581541,39060235,Cardiac Vascular Intensive Care Unit (CVICU),Cardiac Vascular Intensive Care Unit (CVICU),2160-05-18 10:00:53,2160-05-19 17:33:33,1.3143518518518518
10002155,20345487,32358465,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2131-03-09 21:33:00,2131-03-10 18:09:21,0.8585763888888889
10002155,23822395,33685454,Coronary Care Unit (CCU),Coronary Care Unit (CCU),2129-08-04 12:45:00,2129-08-10 17:02:38,6.178912037037037

Q2.1 Ingestion

Import icustays.csv.gz as a tibble icustays_tble.

icustays_tble <- read_csv("~/mimic/icu/icustays.csv.gz")

Q2.2 Summary and visualization

# Count unique subject_ids
num_unique_subjects <- icustays_tble %>%
  distinct(subject_id) %>%
  nrow()
print(paste("Number of unique subject_ids:", num_unique_subjects))
[1] "Number of unique subject_ids: 50920"
# Check for multiple ICU stays per subject_id
icu_stays_per_subject <- icustays_tble %>%
  group_by(subject_id) %>%
  summarise(num_icu_stays = n()) %>%
  ungroup()

# View a summary of ICU stays per subject
summary(icu_stays_per_subject$num_icu_stays)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   1.000   1.000   1.437   1.000  37.000 
# Visualization of the number of ICU stays per subject_id
ggplot(icu_stays_per_subject, aes(x = num_icu_stays)) +
  geom_histogram(binwidth = 1, fill = "#55BC82", color = "black") +
  labs(title = "Distribution of ICU Stays per Subject",
       x = "Number of ICU Stays",
       y = "Count of Subjects") +
  theme_minimal()

# To see if any subject_id has multiple ICU stays
multiple_icu_stays <- icu_stays_per_subject %>%
  filter(num_icu_stays > 1) %>%
  nrow()

print(paste("Number of subjects with multiple ICU stays:", multiple_icu_stays))
[1] "Number of subjects with multiple ICU stays: 12448"

The first block counts the number of unique subject_ids. The second block aggregates the data by subject_id, counting the number of ICU stays for each. The third block creates a histogram visualizing how many subjects have different counts of ICU stays. The last block counts how many subjects had more than one ICU stay. As a result, a subject_id can have multiple ICU stays.

Q3. admissions data

Information of the patients admitted into hospital is available in admissions.csv.gz. See https://mimic.mit.edu/docs/iv/modules/hosp/admissions/ for details of each field in this file. The first 10 lines are:

zcat < ~/mimic/hosp/admissions.csv.gz | head
subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P60CC5,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P51VDL,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0
10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,,EW EMER.,P6957U,WALK-IN/SELF REFERRAL,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,WHITE,2160-11-20 20:36:00,2160-11-21 03:20:00,0
10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,,EU OBSERVATION,P63AD6,PHYSICIAN REFERRAL,,Medicare,ENGLISH,MARRIED,WHITE,2160-12-27 18:32:00,2160-12-28 16:07:00,0
10000108,27250926,2163-09-27 23:17:00,2163-09-28 09:04:00,,EU OBSERVATION,P38XXV,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2163-09-27 16:18:00,2163-09-28 09:04:00,0
10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,,EU OBSERVATION,P2358X,EMERGENCY ROOM,,Other,ENGLISH,DIVORCED,WHITE,2181-11-14 21:51:00,2181-11-15 09:57:00,0

Q3.1 Ingestion

Import admissions.csv.gz as a tibble admissions_tble.

admissions_tble <- read_csv("~/mimic/hosp/admissions.csv.gz")

Q3.2 Summary and visualization

Summarize the following information by graphics and explain any patterns you see.

  • number of admissions per patient
  • admission hour (anything unusual?)
  • admission minute (anything unusual?)
  • length of hospital stay (from admission to discharge) (anything unusual?)

According to the MIMIC-IV documentation, > All dates in the database have been shifted to protect patient confidentiality. Dates will be internally consistent for the same patient, but randomly distributed in the future. Dates of birth which occur in the present time are not true dates of birth. Furthermore, dates of birth which occur before the year 1900 occur if the patient is older than 89. In these cases, the patient’s age at their first admission has been fixed to 300.

# 1. Number of admissions per patient
admissions_per_patient <- admissions_tble %>%
  count(subject_id) %>%
  ggplot(aes(x=n)) +
  geom_histogram(binwidth=1, fill="#D972ED", color="black") +
  labs(title="Number of Admissions per Patient",
       x="Number of Admissions",
       y="Count of Patients") +
  theme_minimal()
print(admissions_per_patient)

We see that the distribution of the number of admissions per patient is very uneven. The majority of the population, about 0 to 20 times, accounts for a large proportion, indicating that only a few people in the population need frequent hospital visits. For those with more than 20 admissions, they are basically a very small number of people, but there are still cases ranging from 20 to over 200 times, indicating that they may be potential severe disease patients.

# 2. Admission Hour
admissions_tble$admission_hour <- hour(ymd_hms(admissions_tble$admittime))
Warning: 10925 failed to parse.
admission_hour_distribution <- ggplot(admissions_tble, aes(x=admission_hour)) +
  geom_histogram(binwidth=1, fill="#54ADF1", color="black") +
  labs(title="Admission Hour Distribution",
       x="Hour of Day",
       y="Admission Count") +
  theme_minimal()+
  xlim(c(0, 23))
print(admission_hour_distribution)
Warning: Removed 10925 rows containing non-finite values (`stat_bin()`).
Warning: Removed 2 rows containing missing values (`geom_bar()`).

For the distribution of the number of people by hour, it can be observed that there are obvious troughs in the early morning and at noon, while there are peaks from afternoon to evening. This indicates that people generally go to the hospital in the afternoon or evening, suggesting that they may need to work during the day or that the hospital has arranged for meals/rest at noon. The early morning period corresponds to when people are sleeping.

# 3. Admission Minute
admissions_tble$admission_minute <- minute(ymd_hms(admissions_tble$admittime))
Warning: 10925 failed to parse.
admission_minute_distribution <- ggplot(admissions_tble, aes(x=admission_minute)) +
  geom_histogram(binwidth=1, fill="coral", color="black") +
  labs(title="Admission Minute Distribution",
       x="Minute of Hour",
       y="Admission Count") +
  theme_minimal()+
  xlim(c(0, 59))
print(admission_minute_distribution)
Warning: Removed 10925 rows containing non-finite values (`stat_bin()`).
Warning: Removed 2 rows containing missing values (`geom_bar()`).

In theory, the number of people entering the hospital within an hour should be relatively evenly distributed. However, there are three extreme values in this distribution. It can be considered whether there is data abnormality or whether the statistical data happened to coincide with special circumstances at that time.

# 4. Length of Hospital Stay
admissions_tble$los_days <- as.numeric(difftime(ymd_hms(admissions_tble$dischtime),
                                                ymd_hms(admissions_tble$admittime),
                                                units="days"))
Warning: 613 failed to parse.
Warning: 10925 failed to parse.
hospital_stay_distribution <- ggplot(admissions_tble, aes(x=los_days)) +
  geom_histogram(fill="green", color="black", binwidth=1) +
  labs(title="Length of Hospital Stay Distribution",
       x="Length of Stay (Days)",
       y="Count") +
  theme_minimal() +
  xlim(c(0, 30)) # Limiting to 30 days for clearer visualization
print(hospital_stay_distribution)
Warning: Removed 16150 rows containing non-finite values (`stat_bin()`).
Warning: Removed 2 rows containing missing values (`geom_bar()`).

The number of days spent in the hospital also conforms to the actual pattern. Most people with minor illnesses can be discharged on the same day, while some need to be observed in the hospital for about 7 days, and this group is also quite large. The number of people who need to stay for more than 10 days drops sharply, indicating that they may have serious underlying diseases and require longer treatment.

Q4. patients data

Patient information is available in patients.csv.gz. See https://mimic.mit.edu/docs/iv/modules/hosp/patients/ for details of each field in this file. The first 10 lines are

zcat < ~/mimic/hosp/patients.csv.gz | head
subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
10000032,F,52,2180,2014 - 2016,2180-09-09
10000048,F,23,2126,2008 - 2010,
10000068,F,19,2160,2008 - 2010,
10000084,M,72,2160,2017 - 2019,2161-02-13
10000102,F,27,2136,2008 - 2010,
10000108,M,25,2163,2014 - 2016,
10000115,M,24,2154,2017 - 2019,
10000117,F,48,2174,2008 - 2010,
10000178,F,59,2157,2017 - 2019,

Q4.1 Ingestion

Import patients.csv.gz (https://mimic.mit.edu/docs/iv/modules/hosp/patients/) as a tibble patients_tble.

patients_tble <- read_csv("~/mimic/hosp/patients.csv.gz")

Q4.2 Summary and visualization

# Visualization for Gender Distribution
ggplot(patients_tble, aes(x=gender)) +
  geom_bar(fill="skyblue", color="black") +
  labs(title="Gender Distribution of Patients",
       x="Gender",
       y="Count") +
  theme_minimal()

Male patients are slightly more numerous than female patients.

# Visualization for Anchor Age Distribution
ggplot(patients_tble, aes(x=anchor_age)) +
  geom_histogram(binwidth=5, fill="plum", color="black") +
  labs(title="Anchor Age Distribution of Patients",
       x="Anchor Age",
       y="Frequency") +
  theme_minimal()

Observations show that the frequency of hospital visits is higher among people aged 25-30, with a similar frequency for those aged 30-60. As age increases beyond 60, the number of visits decreases. This may be related to the demographic structure.

Q5. Lab results

labevents.csv.gz (https://mimic.mit.edu/docs/iv/modules/hosp/labevents/) contains all laboratory measurements for patients. The first 10 lines are

zcat < ~/mimic/hosp/labevents.csv.gz | head
labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments
1,10000032,,45421181,51237,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,1.4,1.4,,0.9,1.1,abnormal,ROUTINE,
2,10000032,,45421181,51274,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,___,15.1,sec,9.4,12.5,abnormal,ROUTINE,VERIFIED.
3,10000032,,52958335,50853,P28Z0X,2180-03-23 11:51:00,2180-03-25 11:06:00,___,15,ng/mL,30,60,abnormal,ROUTINE,NEW ASSAY IN USE ___: DETECTS D2 AND D3 25-OH ACCURATELY.
4,10000032,,52958335,50861,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,102,102,IU/L,0,40,abnormal,ROUTINE,
5,10000032,,52958335,50862,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,3.3,3.3,g/dL,3.5,5.2,abnormal,ROUTINE,
6,10000032,,52958335,50863,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,109,109,IU/L,35,105,abnormal,ROUTINE,
7,10000032,,52958335,50864,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,___,8,ng/mL,0,8.7,,ROUTINE,MEASURED BY ___.
8,10000032,,52958335,50868,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,12,12,mEq/L,8,20,,ROUTINE,
9,10000032,,52958335,50878,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,143,143,IU/L,0,40,abnormal,ROUTINE,

d_labitems.csv.gz (https://mimic.mit.edu/docs/iv/modules/hosp/d_labitems/) is the dictionary of lab measurements.

zcat < ~/mimic/hosp/d_labitems.csv.gz | head
itemid,label,fluid,category
50801,Alveolar-arterial Gradient,Blood,Blood Gas
50802,Base Excess,Blood,Blood Gas
50803,"Calculated Bicarbonate, Whole Blood",Blood,Blood Gas
50804,Calculated Total CO2,Blood,Blood Gas
50805,Carboxyhemoglobin,Blood,Blood Gas
50806,"Chloride, Whole Blood",Blood,Blood Gas
50808,Free Calcium,Blood,Blood Gas
50809,Glucose,Blood,Blood Gas
50810,"Hematocrit, Calculated",Blood,Blood Gas

We are interested in the lab measurements of creatinine (50912), potassium (50971), sodium (50983), chloride (50902), bicarbonate (50882), hematocrit (51221), white blood cell count (51301), and glucose (50931). Retrieve a subset of labevents.csv.gz that only containing these items for the patients in icustays_tble. Further restrict to the last available measurement (by storetime) before the ICU stay. The final labevents_tble should have one row per ICU stay and columns for each lab measurement.

# read 
labevents_pq <- open_dataset("~/mimic/hosp/labevents_pq", format = "parquet")

specific_items <- c(50912, 50971, 50983, 50902, 50882, 51221, 51301, 50931)
filtered_labevents <- labevents_pq %>%
  filter(itemid %in% specific_items) %>%
  collect()

last_labevents_before_icu <- filtered_labevents %>%
  inner_join(icustays_tble, by = c("subject_id", "hadm_id")) %>%
  group_by(subject_id, stay_id) %>%
  slice_max(order_by = storetime, n = 1) %>%
  ungroup()
Warning in inner_join(., icustays_tble, by = c("subject_id", "hadm_id")): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 3927 of `x` matches multiple rows in `y`.
ℹ Row 5 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
final_labevents_tble <- last_labevents_before_icu %>%
  pivot_wider(names_from = itemid, values_from = value, names_prefix = "lab_") %>%
  select(subject_id, stay_id, lab_50912, lab_50971, lab_50983, lab_50902, lab_50882, lab_51221, lab_51301, lab_50931)

final_labevents_tble <- final_labevents_tble %>%
  rename(
    creatinine = lab_50912,
    potassium = lab_50971,
    sodium = lab_50983,
    chloride = lab_50902,
    bicarbonate = lab_50882,
    hematocrit = lab_51221,
    white_blood_cell_count = lab_51301,
    glucose = lab_50931
  )

final_labevents_tble_combined <- final_labevents_tble %>%
  group_by(subject_id, stay_id) %>%
  summarise(
    creatinine = max(creatinine, na.rm = TRUE),
    potassium = max(potassium, na.rm = TRUE),
    sodium = max(sodium, na.rm = TRUE),
    chloride = max(chloride, na.rm = TRUE),
    bicarbonate = max(bicarbonate, na.rm = TRUE),
    hematocrit = max(hematocrit, na.rm = TRUE),
    white_blood_cell_count = max(white_blood_cell_count, na.rm = TRUE),
    glucose = max(glucose, na.rm = TRUE),
    .groups = 'drop'
  )
Warning: There were 212457 warnings in `summarise()`.
The first warning was:
ℹ In argument: `creatinine = max(creatinine, na.rm = TRUE)`.
ℹ In group 4: `subject_id = 10001217` and `stay_id = 37067082`.
Caused by warning in `max()`:
! no non-missing arguments, returning NA
ℹ Run `dplyr::last_dplyr_warnings()` to see the 212456 remaining warnings.
print(final_labevents_tble_combined)
# A tibble: 72,436 × 10
   subject_id  stay_id creatinine potassium sodium chloride bicarbonate
        <dbl>    <dbl> <chr>      <chr>     <chr>  <chr>    <chr>      
 1   10000032 39553978 0.4        5.2       130    97       27         
 2   10000980 39765666 2.2        4.5       142    108      24         
 3   10001217 34592300 0.4        4.3       141    103      27         
 4   10001217 37067082 <NA>       <NA>      <NA>   <NA>     <NA>       
 5   10001725 31205490 0.9        3.7       140    100      32         
 6   10001884 37510196 0.6        4.2       138    97       37         
 7   10002013 39060235 <NA>       4.1       137    97       <NA>       
 8   10002155 31090461 2.4        4.4       136    100      25         
 9   10002155 32358465 <NA>       <NA>      125    <NA>     <NA>       
10   10002155 33685454 1.0        4.9       133    98       28         
# ℹ 72,426 more rows
# ℹ 3 more variables: hematocrit <chr>, white_blood_cell_count <chr>,
#   glucose <chr>

Hint: Use the Parquet format you generated in Homework 2. For reproducibility, make labevents_pq folder available at the current working directory hw3, for example, by a symbolic link.

Q6. Vitals from charted events

chartevents.csv.gz (https://mimic.mit.edu/docs/iv/modules/icu/chartevents/) contains all the charted data available for a patient. During their ICU stay, the primary repository of a patient’s information is their electronic chart. The itemid variable indicates a single measurement type in the database. The value variable is the value measured for itemid. The first 10 lines of chartevents.csv.gz are

zcat < ~/mimic/icu/chartevents.csv.gz | head
subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
10000032,29079034,39553978,47007,2180-07-23 21:01:00,2180-07-23 22:15:00,220179,82,82,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 21:01:00,2180-07-23 22:15:00,220180,59,59,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 21:01:00,2180-07-23 22:15:00,220181,63,63,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220045,94,94,bpm,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220179,85,85,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220180,55,55,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220181,62,62,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220210,20,20,insp/min,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220277,95,95,%,0

d_items.csv.gz (https://mimic.mit.edu/docs/iv/modules/icu/d_items/) is the dictionary for the itemid in chartevents.csv.gz.

zcat < ~/mimic/icu/d_items.csv.gz | head
itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
220001,Problem List,Problem List,chartevents,General,,Text,,
220003,ICU Admission date,ICU Admission date,datetimeevents,ADT,,Date and time,,
220045,Heart Rate,HR,chartevents,Routine Vital Signs,bpm,Numeric,,
220046,Heart rate Alarm - High,HR Alarm - High,chartevents,Alarms,bpm,Numeric,,
220047,Heart Rate Alarm - Low,HR Alarm - Low,chartevents,Alarms,bpm,Numeric,,
220048,Heart Rhythm,Heart Rhythm,chartevents,Routine Vital Signs,,Text,,
220050,Arterial Blood Pressure systolic,ABPs,chartevents,Routine Vital Signs,mmHg,Numeric,90,140
220051,Arterial Blood Pressure diastolic,ABPd,chartevents,Routine Vital Signs,mmHg,Numeric,60,90
220052,Arterial Blood Pressure mean,ABPm,chartevents,Routine Vital Signs,mmHg,Numeric,,

We are interested in the vitals for ICU patients: heart rate (220045), systolic non-invasive blood pressure (220179), diastolic non-invasive blood pressure (220180), body temperature in Fahrenheit (223761), and respiratory rate (220210). Retrieve a subset of chartevents.csv.gz only containing these items for the patients in icustays_tble. Further restrict to the first vital measurement within the ICU stay. The final chartevents_tble should have one row per ICU stay and columns for each vital measurement.

# read 
chartevents_pq <- open_dataset("~/mimic/icu/chartevents_pq", format = "parquet")

specific_items <- c(220045, 220179, 220180, 223761, 220210)
filtered_chartevents <- chartevents_pq %>%
  filter(itemid %in% specific_items) %>%
  collect()

chartevents_icu <- filtered_chartevents %>%
  inner_join(icustays_tble, by = c("subject_id", "hadm_id","stay_id")) %>%
  group_by(subject_id,hadm_id,stay_id) %>%
  slice_max(order_by = storetime, n = 1) %>%
  ungroup()

chartevents_icu_select <- chartevents_icu %>%
  pivot_wider(names_from = itemid, values_from = value, names_prefix = "char_") %>%
  select(subject_id, stay_id,char_220045, char_220179, char_220180, char_223761, char_220210)

chartevents_icu_select <- chartevents_icu_select %>%
  rename(
    heartrate = char_220045,
    systolic_non_invasive_blood_pressure = char_220179,
    diastolic_non_invasive_blood_pressure = char_220180,
    temperature_Fahrenheit = char_223761,
    respiratory_rate = char_220210
  )


final_chartevents_icu_combined <- chartevents_icu_select %>%
  group_by(subject_id, stay_id) %>%
  summarise(
    heartrate = max(heartrate, na.rm = TRUE),
    systolic_non_invasive_blood_pressure = max(systolic_non_invasive_blood_pressure, na.rm = TRUE),
    diastolic_non_invasive_blood_pressure = max(diastolic_non_invasive_blood_pressure, na.rm = TRUE),
    temperature_Fahrenheit = max(temperature_Fahrenheit, na.rm = TRUE),
    respiratory_rate = max(respiratory_rate, na.rm = TRUE),
    .groups = 'drop'
  )
Warning: There were 127136 warnings in `summarise()`.
The first warning was:
ℹ In argument: `heartrate = max(heartrate, na.rm = TRUE)`.
ℹ In group 4: `subject_id = 10001217` and `stay_id = 37067082`.
Caused by warning in `max()`:
! no non-missing arguments, returning NA
ℹ Run `dplyr::last_dplyr_warnings()` to see the 127135 remaining warnings.
print(final_chartevents_icu_combined)
# A tibble: 73,164 × 7
   subject_id  stay_id heartrate systolic_non_invasive_…¹ diastolic_non_invasi…²
        <dbl>    <dbl> <chr>     <chr>                    <chr>                 
 1   10000032 39553978 94        85                       59                    
 2   10000980 39765666 69        131                      69                    
 3   10001217 34592300 80        107                      78                    
 4   10001217 37067082 <NA>      <NA>                     <NA>                  
 5   10001725 31205490 73        91                       58                    
 6   10001884 37510196 74        86                       48                    
 7   10002013 39060235 94        106                      60                    
 8   10002155 31090461 <NA>      <NA>                     <NA>                  
 9   10002155 32358465 75        <NA>                     <NA>                  
10   10002155 33685454 106       138                      66                    
# ℹ 73,154 more rows
# ℹ abbreviated names: ¹​systolic_non_invasive_blood_pressure,
#   ²​diastolic_non_invasive_blood_pressure
# ℹ 2 more variables: temperature_Fahrenheit <chr>, respiratory_rate <chr>

Q7. Putting things together

Let us create a tibble mimic_icu_cohort for all ICU stays, where rows are all ICU stays of adults (age at intime >= 18) and columns contain at least following variables

all variables in icustays_tble all variables in admissions_tble all variables in patients_tble the last lab measurements before the ICU stay in labevents_tble the first vital measurements during the ICU stay in chartevents_tble The final mimic_icu_cohort should have one row per ICU stay and columns for each variable.

# Step 1: Join ICU Stays, Admissions, and Patients Data
icu_admissions_patients <- icustays_tble %>%
  inner_join(admissions_tble, by = c("subject_id", "hadm_id")) %>%
  inner_join(patients_tble, by = "subject_id")

# Step 2: Filter Adults Based on Age at Intime
icu_adults <- icu_admissions_patients %>%
  filter(anchor_age >= 18)

# Step 3: Incorporate Last Lab Measurements Before ICU Stay

icu_adults_lab <- icu_adults %>%
  left_join(final_labevents_tble_combined, by = c("subject_id", "stay_id"))

# Step 4: Incorporate First Vital Measurements During ICU Stay
final_mimic_icu_cohort <- icu_adults_lab %>%
  left_join(final_chartevents_icu_combined, by = c("subject_id", "stay_id"))

print(final_mimic_icu_cohort)
# A tibble: 73,181 × 43
   subject_id  hadm_id  stay_id first_careunit last_careunit intime             
        <dbl>    <dbl>    <dbl> <chr>          <chr>         <dttm>             
 1   10000032 29079034 39553978 Medical Inten… Medical Inte… 2180-07-23 14:00:00
 2   10000980 26913865 39765666 Medical Inten… Medical Inte… 2189-06-27 08:42:00
 3   10001217 24597018 37067082 Surgical Inte… Surgical Int… 2157-11-20 19:18:02
 4   10001217 27703517 34592300 Surgical Inte… Surgical Int… 2157-12-19 15:42:24
 5   10001725 25563031 31205490 Medical/Surgi… Medical/Surg… 2110-04-11 15:52:22
 6   10001884 26184834 37510196 Medical Inten… Medical Inte… 2131-01-11 04:20:05
 7   10002013 23581541 39060235 Cardiac Vascu… Cardiac Vasc… 2160-05-18 10:00:53
 8   10002155 20345487 32358465 Medical Inten… Medical Inte… 2131-03-09 21:33:00
 9   10002155 23822395 33685454 Coronary Care… Coronary Car… 2129-08-04 12:45:00
10   10002155 28994087 31090461 Medical/Surgi… Medical/Surg… 2130-09-24 00:50:00
# ℹ 73,171 more rows
# ℹ 37 more variables: outtime <dttm>, los <dbl>, admittime <dttm>,
#   dischtime <dttm>, deathtime <dttm>, admission_type <chr>,
#   admit_provider_id <chr>, admission_location <chr>,
#   discharge_location <chr>, insurance <chr>, language <chr>,
#   marital_status <chr>, race <chr>, edregtime <dttm>, edouttime <dttm>,
#   hospital_expire_flag <dbl>, admission_hour <int>, admission_minute <int>, …

Q8. Exploratory data analysis (EDA)

Summarize the following information about the ICU stay cohort mimic_icu_cohort using appropriate numerics or graphs:

Length of ICU stay los vs demographic variables (race, insurance, marital_status, gender, age at intime)

mimic_icu_cohort <- final_mimic_icu_cohort %>%
  mutate(age_at_intime = anchor_age) %>%
  filter(age_at_intime >= 18)  

# Stay vs race
ggplot(mimic_icu_cohort, aes(x = race, y = los)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Length of ICU Stay by Race", x = "Race", y = "Length of Stay (days)")

# Stay vs. Insurance
ggplot(mimic_icu_cohort, aes(x = insurance, y = los)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Length of ICU Stay by Insurance Type", x = "Insurance Type", y = "Length of Stay (days)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x labels for better readability

# Stay vs. Marital Status
ggplot(mimic_icu_cohort, aes(x = marital_status, y = los)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Length of ICU Stay by Marital Status", x = "Marital Status", y = "Length of Stay (days)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#Stay vs. Gender
ggplot(mimic_icu_cohort, aes(x = gender, y = los)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Length of ICU Stay by Gender", x = "Gender", y = "Length of Stay (days)")

# Stay vs. Age at Intime
ggplot(mimic_icu_cohort, aes(x = age_at_intime, y = los)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", color = "blue", se = FALSE) +  # Using LOESS for non-linear trend
  theme_minimal() +
  labs(title = "Length of ICU Stay by Age at Intime", x = "Age at Intime", y = "Length of Stay (days)")

Length of ICU stay los vs the last available lab measurements before ICU stay

# LOS vs. Creatinine 
ggplot(mimic_icu_cohort, aes(x = creatinine, y = los)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  theme_minimal() +
  labs(title = "LOS vs Last Creatinine Measurement", x = "Creatinine", y = "Length of Stay (days)")

# LOS vs. Heart Rate
ggplot(mimic_icu_cohort, aes(x = heartrate, y = los)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  theme_minimal() +
  labs(title = "LOS vs First Heart Rate Measurement", x = "Heart Rate", y = "Length of Stay (days)")

# LOS vs. Potassium

ggplot(mimic_icu_cohort, aes(x = potassium, y = los)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", color = "blue", se = FALSE) +
  theme_minimal() +
  labs(title = "Length of ICU Stay vs. Last Potassium Measurement", x = "Potassium", y = "Length of Stay (days)")

# LOS vs. Sodium
 ggplot(mimic_icu_cohort, aes(x = sodium, y = los)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", color = "blue", se = FALSE) +
  theme_minimal() +
  labs(title = "Length of ICU Stay vs. Last Sodium Measurement", x = "Sodium", y = "Length of Stay (days)")

# LOS vs. Chloride
ggplot(mimic_icu_cohort, aes(x = chloride, y = los)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", color = "blue", se = FALSE) +
  theme_minimal() +
  labs(title = "Length of ICU Stay vs. Last Chloride Measurement", x = "Chloride", y = "Length of Stay (days)")

# LOS vs. Bicarbonate
ggplot(mimic_icu_cohort, aes(x = bicarbonate, y = los)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", color = "blue", se = FALSE) +
  theme_minimal() +
  labs(title = "Length of ICU Stay vs. Last Bicarbonate Measurement", x = "Bicarbonate", y = "Length of Stay (days)")

# LOS vs. Hematocrit
ggplot(mimic_icu_cohort, aes(x = hematocrit, y = los)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", color = "blue", se = FALSE) +
  theme_minimal() +
  labs(title = "Length of ICU Stay vs. Last Hematocrit Measurement", x = "Hematocrit", y = "Length of Stay (days)")

# LOS vs. White Blood Cell Count
ggplot(mimic_icu_cohort, aes(x = white_blood_cell_count, y = los)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", color = "blue", se = FALSE) +
  theme_minimal() +
  labs(title = "Length of ICU Stay vs. Last White Blood Cell Count", x = "White Blood Cell Count", y = "Length of Stay (days)")

# LOS vs. Glucose
ggplot(mimic_icu_cohort, aes(x = glucose, y = los)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", color = "blue", se = FALSE) +
  theme_minimal() +
  labs(title = "Length of ICU Stay vs. Last Glucose Measurement", x = "Glucose", y = "Length of Stay (days)")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Length of ICU stay los vs the first vital measurements within the ICU stay

# LOS vs Heart Rate
ggplot(mimic_icu_cohort, aes(x = heartrate, y = los)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  theme_minimal() +
  labs(title = "LOS vs First Heart Rate Measurement", x = "Heart Rate", y = "Length of Stay (days)")+
  theme(legend.position = "none",axis.text.x = element_text(angle = 45, hjust = 1))

# vs. Systolic Non-Invasive Blood Pressure
ggplot(mimic_icu_cohort, aes(x = systolic_non_invasive_blood_pressure, y = los)) +
  geom_point(aes(color = systolic_non_invasive_blood_pressure), alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "LOS vs First Systolic Blood Pressure Measurement",
       x = "Systolic Blood Pressure (mmHg)", 
       y = "Length of Stay (days)") +
  theme_minimal()+
  theme(legend.position = "none")

# vs. Diastolic Non-Invasive Blood Pressure
ggplot(mimic_icu_cohort, aes(x = diastolic_non_invasive_blood_pressure, y = los)) +
  geom_point(aes(color = diastolic_non_invasive_blood_pressure), alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "LOS vs First Diastolic Blood Pressure Measurement",
       x = "Diastolic Blood Pressure (mmHg)", 
       y = "Length of Stay (days)") +
  theme_minimal()+
  theme(legend.position = "none")

# vs Temperature 
ggplot(mimic_icu_cohort, aes(x = temperature_Fahrenheit, y = los)) +
  geom_point(aes(color = temperature_Fahrenheit), alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "LOS vs First Temperature Measurement",
       x = "Temperature (°F)", 
       y = "Length of Stay (days)") +
  theme_minimal()+
  theme(legend.position = "none")

# vs Respiratory Rate
ggplot(mimic_icu_cohort, aes(x = respiratory_rate, y = los)) +
  geom_point(aes(color = respiratory_rate), alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "green") +
  labs(title = "LOS vs First Respiratory Rate Measurement",
       x = "Respiratory Rate (breaths/min)", 
       y = "Length of Stay (days)") +
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
  legend.position = "none"
  )

Length of ICU stay los vs first ICU unit

ggplot(mimic_icu_cohort, aes(x = first_careunit, y = los)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Length of ICU Stay by First ICU Unit", x = "First ICU Unit", y = "Length of Stay (days)")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))